1 Config

Load libraries and functions:

# libraries from CRAN
library(dplyr)
library(tidyr)
library(readr)
library(data.table)
library(glue)
library(DT)
library(stringr)
library(kableExtra)
library(RColorBrewer)
library(ggplot2)
library(cowplot)
library(Seurat)

# custom functions
source(file.path(params$assets, "resources", "functions.R"))

Check if a run with this name already exists, and exit on an error if so:

run <- params$run

# this is to ensure that a) runs are not overwritten by accident, and b)
# the contents of a run reflects what's actually in the config file
if (dir.exists(glue("output/{run}.preprocessing/"))) {
  
  stop("A run with this name, ", run, " already exists. Over-writing ",
       "is not enabled; to create another iteration of the same run, remove ",
       "outputs and figures folders from the previous run with \nrm -rf output/",
       run, ".preprocessing/ \nrm -rf figures/", run, ".preprocessing/",
       "\nrm ", run, ".preprocessing.html \nTo create a different run, use a custom ",
       ".preprocessing.config.tsv with a different prefix.")
  
}

Read in the config file for this run. These parameters can be altered by creating a copy of the default.preprocessing.config.tsv file, with a different prefix.

# if the default keyword is used, the default file *must* be used
if (params$run == "default") {
  
  config_tsv <- read_tsv(file.path(params$assets, "default.preprocessing.config.tsv"))
  message("Since the run name is 'default'; this iteration is run with default ",
          "config file at ",
          file.path(params$assets, "default.preprocessing.config.tsv"),
          ". To create a run with custom config, choose a run name other",
          " than 'default'.")
  
} else { # otherwise, the custom config file will be used
  
  # read in specification & print
  config_tsv <- read_tsv(params$config)
  
}

config_tsv
# coerce to a list of param:value pairs, and make sure numeric parameters are numeric type
config <- as.list(tibble::deframe(config_tsv[, c(1, 2)]))
config[grepl("numeric", config_tsv$description)] <- as.numeric(config[grepl("numeric", config_tsv$description)])

Set up directory structure and confirm current working directory:

# print current working directory
getwd()
## [1] "/lustre03/project/6004736/singlecell_pipeline/data/njabado/10X_sn_chromium_3prim/2019-04-05-PROJECT16541_RUN5022_NJ-scRNAseq-10X_Aug_2018_hg/R092sn/seurat_v3_Marie"
# specify and create output and figures folders with run prefix
output_dir <- glue("output/{params$run}.preprocessing/")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

figures_dir <- paste0(knitr::opts_knit$get("root.dir"), "/figures/", glue("{params$run}.preprocessing"), "/")
dir.create(figures_dir, showWarnings = FALSE, recursive = TRUE)

# save the config to the output
file.copy(params$config, file.path(output_dir, "config.tsv"), overwrite = TRUE)
## [1] TRUE

Set up R markdown:

knitr::opts_chunk$set(
  # Display PNG in HTML file, but also keep PDF outputs in figures folder
  dev = c("png", "pdf"), 
  # Keep all the figures produced in a chunk
  fig.keep = "all",      
  # Save all figures to this output folder
  fig.path = figures_dir,
  # Do not cache results
  cache = FALSE)

# not used
# cache.path = file.path(knitr::opts_knit$get("root.dir"), ".cache/")

# don't use dingbats in PDFs, in order to create Illustrator-friendly figures
grDevices::pdf.options(useDingbats = FALSE)

Specify sample and project:

# get all path and split into a vector based on /
path_vector <- unlist(strsplit(getwd(), split = "/")) 

# get the sample ID, which is the second-to-last element
(sample_id  <- path_vector[length(path_vector) - 1])
## [1] "R092sn"
# get the project ID, which is the third-to-last element
(project_id <- path_vector[length(path_vector) - 2])
## [1] "2019-04-05-PROJECT16541_RUN5022_NJ-scRNAseq-10X_Aug_2018_hg"

2 Initialization and initial filtering

2.1 Load cellranger output and initialize Seurat object

As input, we use the filtered matrix output by cellranger, which distinguishes barcodes representing cells from background.

# if the cellranger directory is not specified, infer it based on the default directory
# names for each technology. Otherwise (only possible in a non-default run),
# use the specified cellranger directory
cellranger_dir <- ifelse(is.na(config$cellranger_dir),
                         switch(params$technology,
                                "10X_sc_chromium_3prim" = "cellranger_count",
                                "10X_sn_chromium_3prim" = "cellranger_count_premrna"),
                         config$cellranger_dir)

# find & load the filtered h5 file in the cellranger directory;
# a wildcard is used because cellranger v2 uses a "filtered_gene_bc_matrix.h5"
# naming convention, while cellranger v3 uses "filtered_feature_bc_matrix.h5"
(h5_file          <- list.files(file.path("..", cellranger_dir),
                                pattern = glob2rx("filtered*.h5"), full.names = TRUE))
## [1] "../cellranger_count_premrna/filtered_feature_bc_matrix.h5"
cellranger_matrix <- Seurat::Read10X_h5(h5_file)

Create the Seurat object, which applies two filters on the data. First, only genes which are detected in at least config$min_cells are retained. No further filtering on genes is performed. Second, only cells with at least config$min_features are retained. Note that an additional filter on number of features detected occurs in the next section, however an initial hard cutoff is used here in order to minimize the data that's processed with Seurat.

seurat_prefilt <- CreateSeuratObject(
  counts       = cellranger_matrix,
  min.cells    = config$min_cells,    # specifies the min number of cells in which each gene must be detected
  min.features = config$min_features, # specifies the min number of genes which must be detected in each cell
  project      = sample_id            # populates the project.name slot in the Seurat object
)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# add the sample ID and cell barcodes to the metadata -- to facilitate later joining
# data from different samples, the cell barcode is prefixed with the sample ID
seurat_prefilt@meta.data$sample.id    <- sample_id
seurat_prefilt@meta.data$cell.barcode <- paste0(sample_id, "_", rownames(seurat_prefilt@meta.data))

2.2 Compute mitochondrial and ribosomal content

For quality control, we assess the mitochondrial content and ribosomal content at the single-cell level, using the proportion of reads which map to mitochondrial genes, or ribosomal protein genes, respectively.

# identify mitochondrial genes, which depends on the species, due to gene name differences
if (config$species == "h_sapiens") {
  mito_genes <- grep("^MT-", rownames(GetAssayData(object = seurat_prefilt)), value = TRUE)
} else if (config$species == "m_musculus") {
  mito_genes <- grep("^mt-", rownames(GetAssayData(object = seurat_prefilt)), value = TRUE)
}

# compute, for each cell, the proportion of reads in mitochondrial genes, and add to the metadata
percent_mito <- Matrix::colSums(GetAssayData(object = seurat_prefilt)[mito_genes, ]) /
  Matrix::colSums(GetAssayData(object = seurat_prefilt)) * 100
seurat_prefilt <- AddMetaData(seurat_prefilt, percent_mito, "percent.mito")

# identify ribosomal genes, which depends on the species, due to gene name differences
if (config$species == "h_sapiens") {
  ribo_genes <- grepl("^RPS|^RPL|^MRPS|^MRPS", rownames(GetAssayData(object = seurat_prefilt)))
} else if (config$species == "m_musculus") {
  ribo_genes <- grepl("^Rps|^Rpl|^Mrps|^Mrps", rownames(GetAssayData(object = seurat_prefilt)))
}

# compute, for each cell, the proportion of reads in ribsomal genes, and add to the metadata
percent_ribo <- Matrix::colSums(GetAssayData(object = seurat_prefilt)[ribo_genes, ]) /
  Matrix::colSums(GetAssayData(object = seurat_prefilt)) * 100
seurat_prefilt <- AddMetaData(seurat_prefilt, percent_ribo, "percent.ribo")

2.3 Initialize warnings

warnings <- list("LOW_N_CELLS"        = FALSE,
                 "HIGH_MITO"          = FALSE,
                 "MANY_HIGH_MITO"     = FALSE,
                 "HIGH_PROP_FILTERED" = FALSE,
                 "LOW_AVG_UMI"        = FALSE,
                 "CC_ASSSIGNMENT"     = FALSE,
                 "SMALL_CLUSTERS"     = FALSE)

3 QC and filtering

In this section, we first load the cellranger QC metrics and display them here. We then compute four QC metrics for scRNAseq, filter cells based on the distributions of these metrics, and then assess the distributions of these metrics after filtering.

3.1 Generate thresholds

The thresholds used are a combination of hard cutoffs and cutoffs computed based on the distribution of each metric within the sample. In the case of min_features, this allows to set a permissive hard cutoff, and use a more stringent cutoff if the quality of the sample allows. In the case of max_mito, this allows to set a stringent hard cutoff, and use a more permissive cutoff if the quality of the sample necessitates doing so in order to avoid losing the majority of cells.

(thresholds <- data.frame(
  # te minimum number of features will be the greater of:
  # 400, or 2 standard deviations below the mean
  min_features = max(400, round(mean(seurat_prefilt@meta.data$nFeature_RNA) -
                                  2*sd(seurat_prefilt@meta.data$nFeature_RNA))),
  max_features = round(mean(seurat_prefilt@meta.data$nFeature_RNA) +
                         2*sd(seurat_prefilt@meta.data$nFeature_RNA)),
  min_mito     = 0,
  # by default,
  # the max mitochondrial content will be the maximum of:
  # 5%, or 2 standard deviations above the mean
  # the parameter config$max_mito allows to set a hard upper threshold,
  # which takes precedence
  max_mito     = ifelse(!is.na(config$max_mito),
                        config$max_mito,
                        min(25, 
                max(5, round(mean(seurat_prefilt@meta.data$percent.mito) +
                                       2*sd(seurat_prefilt@meta.data$percent.mito))
                        ))
                ),
  # set a max of 0 in case the value 2 standard deviations below the mean
  # is negative
  min_umi      = max(0, round(mean(seurat_prefilt@meta.data$nCount_RNA) -
                                2*sd(seurat_prefilt@meta.data$nCount_RNA))),
  max_umi      = round(mean(seurat_prefilt@meta.data$nCount_RNA) +
                         2*sd(seurat_prefilt@meta.data$nCount_RNA))
))
# given the resulting thresholds, call a function to identify the cells
# which pass all filters, which returns barcodes
keep_cells <- get_cells_to_filter(
  seurat    = seurat_prefilt,
  min_features = thresholds$min_features,
  max_features = thresholds$max_features,
  min_mito  = thresholds$min_mito,
  max_mito  = thresholds$max_mito,
  min_umi   = thresholds$min_umi,
  max_umi   = thresholds$max_umi
)

3.2 QC metrics before filtering

Visualize each QC metric:

# violin plots
plot_grid(VlnPlot(seurat_prefilt, c("nFeature_RNA"), pt.size = -1) +
            theme(legend.position = "none"),
          VlnPlot(seurat_prefilt, c("nCount_RNA"), pt.size = -1) +
            theme(legend.position = "none"),
          VlnPlot(seurat_prefilt, c("percent.mito"), pt.size = -1) +
            theme(legend.position = "none"),
          VlnPlot(seurat_prefilt, c("percent.ribo"), pt.size = -1) +
            theme(legend.position = "none"),
          ncol = 4)

# regenerate with points
plot_grid(VlnPlot(seurat_prefilt, c("nFeature_RNA"), pt.size = 0.1) +
            theme(legend.position = "none"),
          VlnPlot(seurat_prefilt, c("nCount_RNA"), pt.size = 0.1) +
            theme(legend.position = "none"),
          VlnPlot(seurat_prefilt, c("percent.mito"), pt.size = 0.1) +
            theme(legend.position = "none"),
          VlnPlot(seurat_prefilt, c("percent.ribo"), pt.size = 0.1) +
            theme(legend.position = "none"),
          ncol = 4)

3.3 Filtering and QC metrics after filtering

Using the barcodes which passed all filters, subset the Seurat objects to generate a filtered object:

# subset the object using the barcodes of the cells to keep
seurat <- subset(seurat_prefilt, cells = keep_cells)

seurat
## An object of class Seurat 
## 22714 features across 1042 samples within 1 assay 
## Active assay: RNA (22714 features, 0 variable features)
seurat_prefilt
## An object of class Seurat 
## 22714 features across 1112 samples within 1 assay 
## Active assay: RNA (22714 features, 0 variable features)

Regenerate the violin plots after filtering:

# violin plots
plot_grid(VlnPlot(seurat, c("nFeature_RNA"), pt.size = -1) +
            theme(legend.position = "none"),
          VlnPlot(seurat, c("nCount_RNA"),   pt.size = -1) +
            theme(legend.position = "none"),
          VlnPlot(seurat, c("percent.mito"), pt.size = -1) +
            theme(legend.position = "none"),
          VlnPlot(seurat, c("percent.ribo"), pt.size = -1) +
            theme(legend.position = "none"),
          ncol = 4)

plot_grid(VlnPlot(seurat, c("nFeature_RNA"), pt.size = 0.1) +
            theme(legend.position = "none"),
          VlnPlot(seurat, c("nCount_RNA"),   pt.size = 0.1) +
            theme(legend.position = "none"),
          VlnPlot(seurat, c("percent.mito"), pt.size = 0.1) +
            theme(legend.position = "none"),
          VlnPlot(seurat, c("percent.ribo"), pt.size = 0.1) +
            theme(legend.position = "none"),
          ncol = 4)

3.4 Output summary stats and filtering thresholds

For each of the metrics we filtered on, we save the min, max and mean before and after filtering, as well as the lower and upper thresholds used.

filtering_criteria <- c("nFeature_RNA", "nCount_RNA", "percent.mito", "percent.ribo")

# compute summary stats for each metric
filtering_metrics <- sapply(filtering_criteria, function(criterion) {
  
  min_pre   <- round(min(seurat_prefilt@meta.data  %>% pull(criterion)), 2)
  mean_pre  <- mean(seurat_prefilt@meta.data %>% pull(criterion))
  max_pre   <- max(seurat_prefilt@meta.data  %>% pull(criterion))
  sd_pre    <- sd(seurat_prefilt@meta.data   %>% pull(criterion))
  
  min_post  <- min(seurat@meta.data  %>% pull(criterion))
  mean_post <- mean(seurat@meta.data %>% pull(criterion))
  max_post  <- max(seurat@meta.data  %>% pull(criterion))
  sd_post   <- sd(seurat@meta.data   %>% pull(criterion))
  
  return(c("min.preQC"   = min_pre,
           "mean.preQC"  = mean_pre,
           "max.preQC"   = max_pre,
           "sd.preQC"    = sd_pre,
           "min.postQC"  = min_post,
           "mean.postQC" = mean_post,
           "max.postQC"  = max_post,
           "sd.postQC"   = sd_post))
  
})

# round to 2 decimal places
filtering_metrics <- apply(filtering_metrics, 2, round, 2)

# transform into a dataframe
filtering_metrics <- filtering_metrics %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "criterion") %>%
  dplyr::select(criterion, min.preQC, min.postQC, max.preQC, max.postQC, mean.preQC, mean.postQC, sd.preQC, sd.postQC)

# add thresholds
filtering_metrics$min.threshold <- c(thresholds$min_features,
                                     thresholds$min_umi,
                                     thresholds$min_mito,
                                     # No min threshold used for % ribo
                                     NA)

filtering_metrics$max.threshold <- c(thresholds$max_features,
                                     thresholds$max_umi,
                                     thresholds$max_mito,
                                     # No max threshold used for % ribo
                                     NA)
filtering_metrics
# compute number of cells before and after filtering
N_cells_metrics <- data.frame(
  "N_cells_before" = dim(seurat_prefilt@meta.data)[1],
  "N_cells_after"  = dim(seurat@meta.data)[1]) %>%
  mutate(Prop_kept = round(N_cells_after / N_cells_before, 2))

N_cells_metrics

This report will register warnings if:

  • there are few cells after filtering (<1000)
  • more than 40% of cells were filtered out
  • the max mitochondrial content after filtering is > 5% (indicating a higher threshold needed to be used)
  • the average number of UMIs after filtering is < 2000

The warnings will be output at the end of the report, and saved as a TSV if any warning flags are TRUE.

if (N_cells_metrics$N_cells_after < 1000) warnings$LOW_N_CELLS <- TRUE
if (N_cells_metrics$Prop_kept < 0.6) warnings$HIGH_PROP_FILTERED <- TRUE
if (filtering_metrics[filtering_metrics$criterion ==
                      "nCount_RNA", ]$mean.postQC < 2000) warnings$LOW_AVG_UMI <- TRUE
if (filtering_metrics[filtering_metrics$criterion ==
                      "percent.mito", ]$max.postQC > 5) warnings$HIGH_MITO <- TRUE

We also want to register a warning if a significant proportion of cells (e.g. more than 5%) have a really high mito content (e.g. 10%) after filtering.

N_cells_highMito <- dim(seurat@meta.data %>% filter(percent.mito >= 10))[1]
prop_cells_highMito <- N_cells_highMito / N_cells_metrics$N_cells_after

if (prop_cells_highMito >= 0.05) warnings$MANY_HIGH_MITO <- TRUE

4 Normalization and scaling

Normalization is performed using two different methods. SCTransform normalizes UMI data using a variance stabilized transform based on a negative binomial regression model, and simultaneously regresses unwanted sources of variation (by default, number of UMIs and mitochondrial content). The sequence of NormalizeData, FindVariableFeatures, and ScaleData comprise the second method, which scale counts to 10,000 UMIs per cell, log2-transform counts, and then regress out unwanted sources of variation.

The log-normalized values are set as the default assay in the Seurat object, but the SCTransform values are saved in the SCTransform assay of the Seurat object, accessible with seurat[["SCT"]].

# normalization 1: scale counts to 10000 UMIs per cell, and log2-transform the counts
# set the default assay to RNA to run the log-normalization & scaling
seurat <- seurat %>% 
  NormalizeData(normalization.method = "LogNormalize",
                scale.factor         = 10000) %>% 
  # identify variable genes
  FindVariableFeatures(mean.function       = ExpMean,
                       dispersion.function = LogVMR) %>%
  # regress out variables which are sources of unwanted variation, and z-score data
  ScaleData(vars.to.regress = unlist(str_split(config$var_regress, ",")))

# normalization 2: using SCTransform
# this command also identifies variable features and produces scaled values
# I don't want to do this because it generated a problem when merging
# but it might be useful somewhen maybe, so I'll just return all genes
# See: https://github.com/satijalab/seurat/issues/2368
seurat <- seurat %>%
  SCTransform(vars.to.regress = unlist(str_split(config$var_regress, ",")), verbose = FALSE, return.only.var.genes = F)

# choose the normalization method to use for downstream analysis
DefaultAssay(object = seurat) <- switch(config$normalization,
                                        "LogNormalize" = "RNA",
                                        "SCTransform"  = "SCT")

# confirm the default assay set
DefaultAssay(seurat)
## [1] "RNA"

5 Dimensionality reduction and clustering

seurat <- seurat %>%
  # compute PCA, based on scaled data
  RunPCA(pc.genes        = VariableFeatures(.),
         npcs            = config$pcs_compute,
         ndims.print     = 1:5,
         nfeatures.print = 5) %>%
  # compute tSNE embedding, based on retained PCs
  RunTSNE(dims = 1:config$pcs_keep, verbose = FALSE, seed.use = config$seed) %>%
  # compute UMAP embedding, based on retained PCs
  RunUMAP(dims = 1:config$pcs_keep, verbose = FALSE, seed.use = config$seed)
## PC_ 1 
## Positive:  SYTL3, PLXDC2, SAMSN1, KYNU, DOCK8 
## Negative:  CADM2, DPP10, ZFPM2, OPCML, PPP2R2B 
## PC_ 2 
## Positive:  NOTCH4, ELTD1, PTPRB, NOX4, ERG 
## Negative:  ARHGAP11B, CENPF, KIF14, SGOL2, CENPE 
## PC_ 3 
## Positive:  CADPS, DNER, TNC, EEPD1, CD44 
## Negative:  CENPF, SPC25, APOLD1, ARHGAP11B, CENPE 
## PC_ 4 
## Positive:  ITPKB, CD44, DNER, SPARCL1, CRYAB 
## Negative:  OPCML, MEGF11, TNS3, TMEM132B, POLR2F 
## PC_ 5 
## Positive:  PCDH9, DPP10, RNF219-AS1, LRRC4C, TENM4 
## Negative:  SLC2A1, NDRG1, VIM, FTL, ERO1L
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
ElbowPlot(seurat, ndims = config$pcs_compute)

# call a custom function to compute the variance and cumulative variance
# explained by the top 10 PCs
get_variance_explained(seurat, n = 10)
## $percent.var.explained
##  [1] 14.464758  8.788615  7.307805  5.517137  2.927050  2.300847  1.917206
##  [8]  1.490737  1.325753  1.262907
## 
## $cum.var.explained
##  [1] 14.46476 23.25337 30.56118 36.07831 39.00536 41.30621 43.22342 44.71415
##  [9] 46.03991 47.30282

Visualize genes with the highest loadings for the top PCs:

VizDimLoadings(seurat, dims = 1:3, reduction = "pca", ncol = 3)

Perform SNN clustering:

# step 1. identify neighbours
seurat <- FindNeighbors(seurat,
                        reduction = "pca",
                        dims      = 1:config$pcs_keep,
                        verbose   = TRUE,
                        nn.eps    = 0.5)
## Computing nearest neighbor graph
## Computing SNN
# step 2. identify communities at various resolutions
# to repeat the clustering at various resolutions, we create a function
# with the parameters that will be held constant, and call it several time
# with different values for resolution
clustering_fun <- purrr::partial(FindClusters,
                                 seurat,
                                 verbose = FALSE,
                                 n.start = 10,
                                 random.seed = config$seed)

seurat <- clustering_fun(resolution = 0.6)

p1 <- DimPlot(object = seurat, reduction = "tsne") +
  ggtitle("res 0.6")

seurat <- clustering_fun(resolution = 0.8)

p2 <- DimPlot(object = seurat, reduction = "tsne") +
  ggtitle("res 0.8")

seurat <- clustering_fun(resolution = 1)

p3 <- DimPlot(object = seurat, reduction = "tsne") +
  ggtitle("res 1")

seurat <- clustering_fun(resolution = 2)

p4 <- DimPlot(object = seurat, reduction = "tsne") +
  ggtitle("res 2")

seurat <- clustering_fun(resolution = config$clustering_resolution)

p5 <- DimPlot(object = seurat, reduction = "tsne") +
  ggtitle(paste0("chosen resolution: ", config$clustering_resolution))

plot_grid(p1, p2, p3, p4, p5, ncol = 3)

Save the chosen resolution as the clustering assighnment for the object, although the other solutions will be saved in the metadata. We also assign a custom colour palette with colours that are easier to distinguish than the default:

# the normalization method used will dictate the name of the columns
# containing the clustering solutions
# here, the RHS assembles a string, and if it matches a column in the object
# metadata as expected, Seurat will use that column to assign the active/default
# cell identities (see examples in ?Seurat::Idents() for more details
Idents(object = seurat) <- paste0(switch(config$normalization,
                                         "LogNormalize" = "RNA",
                                         "SCTransform"  = "SCT"),
                                  "_snn_res.",
                                  config$clustering_resolution)

# set a new qualitative palette; this saves the palette to seurat@misc$colours
seurat <- set_qual_pal(seurat)

DimPlot(object = seurat, reduction = "tsne", cols = seurat@misc$colours) +
  ggtitle(paste0("res ", config$clustering_resolution))

DimPlot(object = seurat, reduction = "umap", cols = seurat@misc$colours) +
  ggtitle(paste0("res ", config$clustering_resolution))

6 Cell cycle scoring

To score cells for their cell cycle activity, we use three different methods.

First, we compute the mean expression of the cell cycle phase markers from Whitfield et al, 2002.

cell.cycle.genes.whitfield.2002 <- data.table::fread(file.path(
  params$assets,
  "resources",
  "CellCycleGeneList_1134_whitfield_2002_mice_gene_symbols.txt"))

# compute cell cycle scores
cc <- compute_cell_cycle_whitfield(seurat,
                                   species = config$species,
                                   return_scores = TRUE)

# add the scores to the Seurat object metadata
seurat <- AddMetaData(seurat,
                      setNames(cc$g1.s.scores, cc$cell),
                      "G1.S.score_Whitfield")
seurat <- AddMetaData(seurat,
                      setNames(cc$g2.m.scores, cc$cell),
                      "G2.M.score_Whitfield")

Plotting the Whitfield cell cycle phase scores:

compute_cell_cycle_whitfield(seurat, species = config$species, return_scores = FALSE) +
  scale_color_manual(values = seurat@misc$colours) +
  theme_cowplot()

Next, we compute the scores using the method implemented in Seurat:

# a list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat;
# segregate this list into markers of G2/M phase and markers of S phase
s_genes   <- cc.genes$s.genes
g2m_genes <- cc.genes$g2m.genes

seurat <- CellCycleScoring(seurat,
                           s.features   = s_genes,
                           g2m.features = g2m_genes)
## Warning: The following features are not present in the object: FEN1, UHRF1,
## MLF1IP, CASP8AP2, not searching for symbol synonyms
cc.markers <- switch(config$species,
                     "h_sapiens" = c("PCNA", "TOP2A", "MCM6", "MKI67"),
                     "m_musculus" = c("Pcna", "Top2a", "Mcm6", "Mki67"))

RidgePlot(seurat, group.by = "Phase", features = cc.markers, ncol = 2)
## Picking joint bandwidth of 3.29e-06
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 3.28e-06
## Picking joint bandwidth of 3.41e-06

Plot the correlation between the G2/M score from Seurat (y-axis) an the G2/M score based on the Whitfield genes (x-axis), as well as the correlation with TOP2A:

phase_palette <- c("G1" = "#F8766D", "G2M" = "#00BA38", "S" = "#619CFF")

p1 <- FeatureScatter(seurat, "G2.M.score_Whitfield", "G2M.Score",
                     cols = seurat@misc$colours)

p2 <- FeatureScatter(seurat, "G2.M.score_Whitfield", "G2M.Score",
                     group.by = "Phase",
                     cols = phase_palette)

top2a_gene <- switch(config$species,
              "h_sapiens" = "TOP2A",
                "m_musculus" = "Top2a")

if (top2a_gene %in% rownames(seurat)) {

    p3 <- FeatureScatter(seurat, top2a_gene, "G2M.Score",
                     cols = seurat@misc$colours)

    p4 <- FeatureScatter(seurat, top2a_gene, "G2M.Score",
                     group.by = "Phase",
                     cols = phase_palette)

    cowplot::plot_grid(p1, p2, p3, p4, ncol = 4)

} else {

    # Top2a is not detected
    cowplot::plot_grid(p1, p2)

}

# show only cells assigned G2/M
p1 <- FeatureScatter(seurat, "G2.M.score_Whitfield", "G2M.Score",
                     cells = WhichCells(seurat, expression = Phase == "G2M"),
                     cols = seurat@misc$colours)

p2 <- FeatureScatter(seurat, "G2.M.score_Whitfield", "G2M.Score",
                     group.by = "Phase",
                     cols = phase_palette,
                     cells = WhichCells(seurat, expression = Phase == "G2M"))

if (top2a_gene %in% rownames(seurat)) {

    p3 <- FeatureScatter(seurat, top2a_gene, "G2M.Score",
                     cells = WhichCells(seurat, expression = Phase == "G2M"),
                     cols = seurat@misc$colours)

    p4 <- FeatureScatter(seurat, top2a_gene, "G2M.Score",
                     group.by = "Phase",
                     cols = phase_palette,
                     cells = WhichCells(seurat, expression = Phase == "G2M"))

    cowplot::plot_grid(p1, p2, p3, p4, ncol = 4)

} else {

    cowplot::plot_grid(p1, p2)

}

Register a warning if more than half of the cells labelled G2M have low TOP2A expression:

if (top2a_gene %in% rownames(seurat)) {

    # subset to cells assigned as G2M phase
    top2a_expr <- as.matrix(subset(seurat,
                               subset = Phase == "G2M")[["RNA"]][top2a_gene, ])

    # if more than 50% of cells assigned as G2M phase have low TOP2A expression,
    # flag it
    (prop_low_top2a <- (sum(top2a_expr < 1) / length(top2a_expr)))
    if (prop_low_top2a > 0.5) warnings$CC_ASSSIGNMENT <- TRUE

} else {

    # Top2a is not detected
    warnings$CC_ASSSIGNMENT <- TRUE

}

7 Cluster-level QC

Number of cells in each cluster:

table(Idents(object = seurat))
## 
##   0   1   2   3   4   5   6   7   8   9 
## 174 164 160 141 118 105  67  53  45  15

Register a warning of more than a third of the clusters have fewer than 100 cells:

# number of clusters with < 100 cells, divided by the total number of clusters
if (sum(table(Idents(seurat)) < 100) /
    length(levels(Idents(seurat))) > 0.33) warnings$SMALL_CLUSTERS <- TRUE

Visualize the QC metrics and cell cycle scores in the low-dimensional tSNE and UMAP embeddings, to determine whether clustering / dimensionality reduction is being driven by technical factors:

p0 <- DimPlot(object = seurat, reduction = "pca", cols = seurat@misc$colours)
p1 <- DimPlot(object = seurat, reduction = "tsne", cols = seurat@misc$colours)
p2 <- FeaturePlot(object = seurat, reduction = "tsne", features = "nFeature_RNA") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$nFeature_RNA))
p3 <- FeaturePlot(object = seurat, reduction = "tsne", features = "nCount_RNA") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$nCount_RNA))
p4 <- FeaturePlot(object = seurat, reduction = "tsne", features = "percent.mito") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$percent.mito))
p5 <- FeaturePlot(object = seurat, reduction = "tsne", features = "percent.ribo") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$percent.ribo))
p6 <- FeaturePlot(object = seurat, reduction = "tsne", features = "G1.S.score_Whitfield") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$G1.S.score))
p7 <- FeaturePlot(object = seurat, reduction = "tsne", features = "G2.M.score_Whitfield") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$G2.M.score))

plot_grid(p0, p1, p2, p3, p4, p5, p6, p7,
          ncol = 3)

p0 <- DimPlot(object = seurat, reduction = "pca", cols = seurat@misc$colours)
p1 <- DimPlot(object = seurat, reduction = "umap", cols = seurat@misc$colours)
p2 <- FeaturePlot(object = seurat, reduction = "umap", features = "nFeature_RNA") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$nFeature_RNA))
p3 <- FeaturePlot(object = seurat, reduction = "umap", features = "nCount_RNA") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$nCount_RNA))
p4 <- FeaturePlot(object = seurat, reduction = "umap", features = "percent.mito") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$percent.mito))
p5 <- FeaturePlot(object = seurat, reduction = "umap", features = "percent.ribo") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$percent.ribo))
p6 <- FeaturePlot(object = seurat, reduction = "umap", features = "G1.S.score_Whitfield") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$G1.S.score))
p7 <- FeaturePlot(object = seurat, reduction = "umap", features = "G2.M.score_Whitfield") +
  scale_color_gradient2(low = "darkblue", mid = "lightgrey", high = "red",
                        midpoint = mean(seurat@meta.data$G2.M.score))

plot_grid(p0, p1, p2, p3, p4, p5, p6, p7,
          ncol = 3)

Violin plots of the distribution of each QC metric in each cluster, with the number of cells in each cluster indicated above each violin:

# make a dataframe of the number of cells per cluster
clust_df <- data.frame(table(Idents(object = seurat)))
clust_df[1, 2] <- paste0("N=", clust_df[1, 2])
colnames(clust_df) <- c("ident", "N")

vln_fun <- function(criterion) {
  
  # plot the labels at a value slightly below the max, to ensure they're shown
  # within plot limits
  clust_df$y <- max(seurat[[criterion]]) * 0.95
  Seurat::VlnPlot(seurat, criterion, cols = seurat@misc$colours, pt.size = 0.5) +
    theme(legend.position = "none") +
    geom_text(data = clust_df, aes(label = N, x = ident, y = y))
  
}

vln_fun("nFeature_RNA")

vln_fun("nCount_RNA")

vln_fun("percent.mito")

vln_fun("percent.ribo")

vln_fun("G1.S.score_Whitfield")

vln_fun("G2.M.score_Whitfield")

8 Cluster markers

To identify cluster markers for each cluster for the specified resolution (config$clustering_resolution),

cluster_markers <- FindAllMarkers(object = seurat, verbose = FALSE)

# display the top 30 per cluster
cluster_markers %>%
  dplyr::group_by(cluster) %>%
  top_n(n = 30, wt = avg_logFC) %>%
  dplyr::select(cluster, gene, everything()) %>%
  DT::datatable(cluster_markers, filter = "top")
write_tsv(cluster_markers, file.path(output_dir, "cluster_markers.tsv"))
# Display a heatmap of the top 10 per cluster
top10 <- cluster_markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)

DoHeatmap(seurat, features = top10$gene, group.colors = seurat@misc$colours) +
  NoLegend() +
  scale_fill_gradientn(colors = c("#2166AC", "#E5E0DC", "#B2182B"))
## Warning in DoHeatmap(seurat, features = top10$gene, group.colors = seurat@misc
## $colours): The following features were omitted as they were not found in the
## scale.data slot for the RNA assay: SLC39A14, ITM2C, GFAP, MLC1, PEA15, SORCS3,
## PTCH1, DGKG, KCNQ1OT1, KCNMA1, FKBP5, SOX6, PTPRJ, KAZN, LRRC1, LINC00511,
## XYLT1, TPST1, NTM, RFX4, DCLK2, UBB, HNRNPA3, RPL13, HSP90AB1, CLDND1, RPL13A,
## ACTG1, RASGEF1B
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

9 Save outputs

All parameters and filtering metrics are saved both within the seurat object in the seurat@misc slot, as well as in the output directory.

# parameters are saved with the Seurat object
seurat@misc$params$config     <- config
seurat@misc$filtering_metrics <- filtering_metrics
seurat@misc$n_cells           <- N_cells_metrics

# write metrics/thresholds to file
filtering_metrics_out <- filtering_metrics %>%
  gather("metrics_name", "value", -criterion) %>%
  unite("metrics", criterion:metrics_name, sep = "_") %>%
  spread("metrics", "value") %>% 
  bind_cols(N_cells_metrics)

# include the short version of the most recent git repository SHA
# I'm not using a git repository, let's just save the relative path to the Rmd
filtering_metrics_out$seurat_v3_workflow <- "HGG_Seurat_Nov2020/preprocessing.Rmd"
write_tsv(filtering_metrics_out, path = file.path(output_dir, "seurat_metrics.tsv"))
save(seurat, file = file.path(output_dir, "seurat.Rda"))

10 Warnings

Summary of possible warnings, with issues uncovered in this analysis indicated in red:

# convert the list to a data frame
warnings_df <- data.frame(WARNING = names(warnings),
                          FLAG    = unname(unlist(warnings)))

# if there are any warnings, produce a file
if (any(warnings)) write_tsv(warnings_df %>% filter(FLAG), file.path(output_dir, "warnings.tsv"))

# generate a visual summary, colouring the warnings in red
warnings_df$FLAG = cell_spec(warnings_df$FLAG, background = ifelse(warnings_df$FLAG, "red", "green"))

warnings_df %>% 
  kbl(escape = FALSE) %>% 
  kable_styling(position = "center")
WARNING FLAG
LOW_N_CELLS FALSE
HIGH_MITO TRUE
MANY_HIGH_MITO FALSE
HIGH_PROP_FILTERED FALSE
LOW_AVG_UMI FALSE
CC_ASSSIGNMENT TRUE
SMALL_CLUSTERS TRUE

11 Reproducibility

This document was last rendered on:

## 2020-11-02 20:42:29

The git repository and last commit:

{r repo, echo = FALSE, cache = FALSE}

git2r::repository(params$assets)

The R session info:

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] Seurat_3.2.1       cowplot_1.1.0      ggplot2_3.3.2      RColorBrewer_1.1-2
##  [5] kableExtra_1.3.0   stringr_1.4.0      DT_0.15            glue_1.4.2        
##  [9] data.table_1.13.0  readr_1.3.1        tidyr_1.1.2        dplyr_1.0.2       
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_1.4-1      deldir_0.1-29        
##   [4] ellipsis_0.3.1        ggridges_0.5.2        rstudioapi_0.11      
##   [7] spatstat.data_1.4-3   farver_2.0.3          leiden_0.3.3         
##  [10] listenv_0.8.0         bit64_0.9-7           ggrepel_0.8.2        
##  [13] RSpectra_0.16-0       xml2_1.3.2            codetools_0.2-16     
##  [16] splines_3.6.1         knitr_1.28            polyclip_1.10-0      
##  [19] jsonlite_1.6.1        ica_1.0-2             cluster_2.1.0        
##  [22] png_0.1-7             uwot_0.1.8            shiny_1.4.0.2        
##  [25] sctransform_0.2.1     compiler_3.6.1        httr_1.4.2           
##  [28] Matrix_1.2-17         fastmap_1.0.1         lazyeval_0.2.2       
##  [31] later_1.0.0           htmltools_0.4.0       tools_3.6.1          
##  [34] rsvd_1.0.3            igraph_1.2.6          gtable_0.3.0         
##  [37] RANN_2.6.1            reshape2_1.4.4        rappdirs_0.3.1       
##  [40] Rcpp_1.0.4.6          spatstat_1.64-1       vctrs_0.3.4          
##  [43] nlme_3.1-140          crosstalk_1.1.0.1     lmtest_0.9-38        
##  [46] xfun_0.14             globals_0.13.1        rvest_0.3.6          
##  [49] mime_0.9              miniUI_0.1.1.1        lifecycle_0.2.0      
##  [52] irlba_2.3.3           goftest_1.2-2         future_1.19.1        
##  [55] MASS_7.3-51.4         zoo_1.8-8             scales_1.1.1         
##  [58] hms_0.5.3             promises_1.1.1        spatstat.utils_1.17-0
##  [61] parallel_3.6.1        yaml_2.2.1            reticulate_1.17      
##  [64] pbapply_1.4-3         gridExtra_2.3         rpart_4.1-15         
##  [67] stringi_1.5.3         rlang_0.4.7           pkgconfig_2.0.3      
##  [70] matrixStats_0.56.0    evaluate_0.14         lattice_0.20-38      
##  [73] ROCR_1.0-11           purrr_0.3.4           tensor_1.5           
##  [76] labeling_0.3          patchwork_1.0.1       htmlwidgets_1.5.2    
##  [79] bit_1.1-15.2          tidyselect_1.1.0      RcppAnnoy_0.0.16     
##  [82] plyr_1.8.6            magrittr_1.5          R6_2.4.1             
##  [85] generics_0.0.2        mgcv_1.8-28           pillar_1.4.6         
##  [88] withr_2.2.0           fitdistrplus_1.1-1    survival_2.44-1.1    
##  [91] abind_1.4-5           tibble_3.0.3          future.apply_1.6.0   
##  [94] hdf5r_1.3.3           crayon_1.3.4          KernSmooth_2.23-15   
##  [97] plotly_4.9.2.1        rmarkdown_2.3         grid_3.6.1           
## [100] digest_0.6.25         webshot_0.5.2         xtable_1.8-4         
## [103] httpuv_1.5.2          munsell_0.5.0         viridisLite_0.3.0